#!/usr/bin/python

import sys
from copy import deepcopy
from lxml.html import parse, tostring

################################################################
### main program
################################################################

# Merge the ocr_page elements of every hOCR file named on the command line
# into the first file's document and write the combined HTML to stdout.
if len(sys.argv) < 2:
    sys.stderr.write("combine multiple hOCR documents into one\n\n")
    sys.stderr.write("usage: %s file1.html file2.html...\n" % sys.argv[0])
    sys.exit(1)

# The first document is the output skeleton: its head/metadata are kept and
# pages from the remaining files are grafted into it.  parse() opens the
# path itself, so no separate file handle is needed.
doc = parse(sys.argv[1])

pages = doc.getroot().find_class("ocr_page")
if not pages:
    # Without an existing page there is no parent element to append to.
    sys.stderr.write("%s: no ocr_page element found\n" % sys.argv[1])
    sys.exit(1)

# Additional pages become children of the element that holds the last page
# of the first document.
container = pages[-1].getparent()

for fname in sys.argv[2:]:
    other_root = parse(fname).getroot()
    for page in other_root.find_class("ocr_page"):
        # Copy the page so the source tree is left untouched.
        container.append(deepcopy(page))

# tostring() returns ASCII-encoded bytes by default (non-ASCII characters
# are emitted as character references), so decoding as ASCII is lossless
# and lets the same print call work on both Python 2 and Python 3.
print(tostring(doc, pretty_print=True).decode("ascii"))
